  
*******************************************************************************
*******************************************************************************
* This do-file creates data for simulating balance across covariates
*******************************************************************************
*******************************************************************************

* Reference year and month: used to build date_today (day 15 of this month)
* and embedded in every appl_id.
local year_today  "2024"
local month_today "08"

* Positional argument 1 scales the size of the simulation: the script creates
* (argument 1) x (applications per job) observations. Fail early with a clear
* message instead of a syntax error in the expression below when it is missing.
if "`1'" == "" {
    display as error "argument 1 is required: number of application sets to simulate"
    exit 198
}

* Applications per job; also used further down as the group size.
local num_appl_per_job 3
local num_applications = `1' * `num_appl_per_job'

set obs `num_applications'

* Parameters for randomized application variables.
* The space-separated list is the single source: the count num_occ and the
* numbered locals occupation1 ... occupation<num_occ> are derived from it.
local occupations "mekaniker dataspecialist fordonsförare redovisningsekonom företagssäljare kock restaurangbiträde ekonomiassistent personaladministratör kontorsassistent lagerpersonal kundtjänstpersonal barnskötare personligassistent undersköterska butikssäljare lokalvårdare supporttekniker"
local num_occ : word count `occupations'
forvalues k = 1/`num_occ' {
    local occupation`k' : word `k' of `occupations'
}

* Subsets of the list above. Further down, occupations in female_occ get
* male_occ = 0 (gender "Female"); occupations in high_edu_occ get high_edu = 1.
local male_occ "mekaniker dataspecialist fordonsförare företagssäljare kock lagerpersonal supporttekniker"
local female_occ "redovisningsekonom restaurangbiträde ekonomiassistent personaladministratör kontorsassistent kundtjänstpersonal barnskötare personligassistent undersköterska butikssäljare lokalvårdare"
local high_edu_occ "dataspecialist redovisningsekonom supporttekniker"

* Parameters for language levels.
* num_language_levels is counted from the list so that it agrees with the four
* levels S0-S3 defined here (it was hard-coded to 3 although language4 exists).
local language_levels "S0 S1 S2 S3"
local num_language_levels : word count `language_levels'
forvalues k = 1/`num_language_levels' {
    local language`k' : word `k' of `language_levels'
}

* Share parameters.
* NOTE(review): neither share is referenced by the visible sampling code below,
* which hard-codes its probabilities - confirm whether they should be wired in.
local share_noneurope .6667

local share_newly_arrived .5

* Origin countries. Index k (1..num_countries) selects the k-th entry of each
* list; the same index is used further down for cities, names, phone numbers
* and e-mail addresses.
* The European list is named countries_euro to match countries_noneuro (it was
* misspelled "countrys_euro" and is not referenced elsewhere in the script).
local countries_euro "Polen Rumänien Storbritannien Tyskland"
local countries_noneuro "Afghanistan Iran Irak Syrien"

local num_countries : word count `countries_euro'

forvalues k = 1/`num_countries' {
    local country_euro`k'    : word `k' of `countries_euro'
    local country_noneuro`k' : word `k' of `countries_noneuro'
}

* Street addresses: num_addresses per city, one per application in a group.
* Suffixes: sthlm = Stockholm, gbg = Göteborg, mlm = Malmö (matched against the
* city variable further down).
local num_addresses 3

* Address slot 1
local address1_sthlm "Gräsandsvägen 31, 123 53, Farsta"
local address1_gbg   "Länsmansgatan 8, 431 60, Mölndal"
local address1_mlm   "Koristgatan 22, 215 84, Malmö"

* Address slot 2
local address2_sthlm "Jyllandsgatan 307, 16447, KISTA"
local address2_gbg   "Fyrktorget 4, 41483, GÖTEBORG"
local address2_mlm   "Teknikergatan 27, 21568, MALMÖ"

* Address slot 3
local address3_sthlm "Slättåkragränd 25, 12572 Älvsjö"
local address3_gbg   "Jägaregatan 7D, 41702 Göteborg"
local address3_mlm   "Nikolaigatan 12, 21421 Malmö"

* Origin cities, in the same index order as the country lists
* (city_euro<k> / city_noneuro<k> belongs to country number k).
local k 0
foreach town in Warszawa Bukarest London Berlin {
    local ++k
    local city_euro`k' "`town'"
}

local k 0
foreach town in Kabul Teheran Bagdad Damaskus {
    local ++k
    local city_noneuro`k' "`town'"
}

* Applicant names, indexed by country number (1..4). Each list holds one entry
* per country, in country order; the numbered locals are unpacked from them.
local names_euro_m      "Jakub Andrei Jack Otto"
local names_euro_f      "Zuzanna Elena Emily Heidi"
local names_euro_sur    "Kowalski Popescu Wilson Müller"
local names_noneuro_m   "Amir Ali Mustafa Mohammed"
local names_noneuro_f   "Jamila Fateme Yasmin Laila"
local names_noneuro_sur "Hosseini Mahmoodi Mansour Aboud"

forvalues k = 1/4 {
    local first_euro_male_name`k'      : word `k' of `names_euro_m'
    local first_euro_female_name`k'    : word `k' of `names_euro_f'
    local last_euro_name`k'            : word `k' of `names_euro_sur'

    local first_noneuro_male_name`k'   : word `k' of `names_noneuro_m'
    local first_noneuro_female_name`k' : word `k' of `names_noneuro_f'
    local last_noneuro_name`k'         : word `k' of `names_noneuro_sur'
}

* Contact details, indexed by country number (1..4) within each
* origin x gender cell. Phone numbers contain spaces, so they are listed as
* quoted items; e-mail addresses are unpacked from space-separated lists.

* --- Phone numbers ---
local k 0
foreach num in "073-672 87 69" "072-835 71 76" "072-864 25 17" "073-994 15 58" {
    local ++k
    local phone_euro_male`k' "`num'"
}

local k 0
foreach num in "073-969 94 18" "072-276 34 76" "073-755 49 17" "073-957 91 88" {
    local ++k
    local phone_noneuro_male`k' "`num'"
}

local k 0
foreach num in "073-951 76 70" "073-999 25 64" "076-594 34 47" "072-853 98 36" {
    local ++k
    local phone_euro_female`k' "`num'"
}

local k 0
foreach num in "073-904 17 56" "072-024 29 72" "072-835 26 97" "076-415 12 18" {
    local ++k
    local phone_noneuro_female`k' "`num'"
}

* --- E-mail addresses ---
local mails_euro_male      "jakub.c.kowalski@gmail.com andrei.popescu573@gmail.com wilson.b.jack@gmail.com otto.muller.b@gmail.com"
local mails_noneuro_male   "amir.m.hosseini53@gmail.com ali.v.mahmoodi@gmail.com mustafa.mansour107@gmail.com mohammed.o.aboud@gmail.com"
local mails_euro_female    "zuzannakowalski0@gmail.com elena.e.popescu3@gmail.com emily.e.wilson217@gmail.com heidimuller716@gmail.com"
local mails_noneuro_female "jamilahosseini83@gmail.com fateme.a.mahmoodi@gmail.com yyasmin.mansour@gmail.com laila.m.aboud@gmail.com"

forvalues k = 1/4 {
    local email_euro_male`k'      : word `k' of `mails_euro_male'
    local email_noneuro_male`k'   : word `k' of `mails_noneuro_male'
    local email_euro_female`k'    : word `k' of `mails_euro_female'
    local email_noneuro_female`k' : word `k' of `mails_noneuro_female'
}

*** Set random variables ***
* Draw one occupation index per observation and look up its name.
* occ_no keeps the drawn index; -1 marks a draw outside 1..num_occ.
gen occ_draw = ceil(`num_occ' * uniform())
gen occ = ""
gen occ_no = cond(inrange(occ_draw, 1, `num_occ'), occ_draw, -1)
forvalues k = 1/`num_occ' {
    replace occ = "`occupation`k''" if occ_draw == `k'
}
drop occ_draw

* male_occ: 1 unless the occupation appears in the female_occ list.
gen male_occ = 1
foreach job of local female_occ {
    replace male_occ = 0 if occ == "`job'"
}

* high_edu: 1 only for occupations in the high_edu_occ list.
gen high_edu = 0
foreach job of local high_edu_occ {
    replace high_edu = 1 if occ == "`job'"
}

* Applicant gender follows the male_occ flag of the occupation.
gen gender = cond(male_occ == 1, "Male", cond(male_occ == 0, "Female", ""))

* Number the rows occupation by occupation and cut them into consecutive
* groups of num_appl_per_job rows (one group = one job).
sort occ
gen app_row = _n
gen group = ceil(app_row / `num_appl_per_job')

* Keep only groups that hold exactly num_appl_per_job rows of one occupation;
* groups straddling two occupations (or a short final group) are dropped.
bysort occ group: gen count_group = _N
drop if count_group != `num_appl_per_job'
drop count_group

* appl_type: random permutation of 1..3 within each group, obtained by ranking
* a uniform draw inside the group.
gen app_type_tmp = uniform()
bysort group (app_type_tmp): gen appl_type = _n
drop app_type_tmp

* appl_order: a second, independent random permutation of 1..3 within group.
* The data are left sorted by group and the order draw, as before.
gen order_tmp = uniform()
bysort group (order_tmp): gen appl_order = _n
drop order_tmp

* One city per group: draw an integer 1..6 on the group's first row (lowest
* app_row) and copy it to the whole group.
*   1-3 -> Stockholm, 6 -> Malmö, anything else (4-5) -> Göteborg
sort group app_row
by group: gen city_draw = floor(6 * uniform() + 1) if _n == 1
by group: replace city_draw = city_draw[1]
gen city = cond(city_draw <= 3, "Stockholm", cond(city_draw == 6, "Malmö", "Göteborg"))
drop city_draw

* Draw the group-level category G once per group (on the row with
* appl_order == 1) and copy it to the other rows of the group.
*   G1 with probability share_newly_arrived, otherwise G2.
* G1 is what is later flagged as newly_arrived, so the threshold uses the
* share_newly_arrived parameter instead of a second hard-coded .50.
* The former "G3 if draw > 1" branch was removed: a uniform draw is never
* above 1, so it could not fire. G3 is assigned later, per application.
sort group appl_order

gen G_tmp = uniform() if appl_order == 1
gen G = "G1" if G_tmp <= `share_newly_arrived'
replace G = "G2" if G_tmp > `share_newly_arrived' & !missing(G_tmp)
* Rows with appl_order 2 and 3 inherit the value drawn on the row above.
replace G = G[_n-1] if G == "" & group == group[_n-1]
drop G_tmp

* Pick two rows per group (lowest and highest uniform draw) and give them two
* DIFFERENT language levels in 1..4. The remaining row keeps a missing level.
gen language_level_group = uniform()
sort group language_level_group
by group: egen draw_lo = min(language_level_group)
by group: egen draw_hi = max(language_level_group)

* Row with the lowest draw: level uniform on 1..4; share it with the group.
gen language_level = floor((4)*runiform() + 1) if language_level_group == draw_lo
egen language_level_first = max(language_level), by(group)

* Row with the highest draw: pick one of the 3 remaining levels. The draw
* 1..3 indexes the levels with the first row's level removed, i.e. it is
* shifted up by one when it is at or above that level.
gen language_level_tmp = floor((3)*runiform() + 1) if language_level_group == draw_hi
replace language_level = language_level_tmp + (language_level_tmp >= language_level_first) ///
    if inrange(language_level_tmp, 1, 3) & inrange(language_level_first, 1, 4)

drop language_level_group draw_lo draw_hi language_level_first language_level_tmp

* The row that still has no language level becomes the G3 application of a
* G1/G2 group and receives an unconstrained level in 1..4.
gen G_tmp = "G3" if inlist(G, "G1", "G2") & missing(language_level)
* NOTE(review): this branch needs G == "G3" before any row has been set to G3
* here - it looks unreachable unless G3 is assigned earlier; confirm.
replace G_tmp = "G" + string(floor((2)*runiform() + 1)) if G == "G3" & missing(language_level)
replace language_level = floor((4)*runiform() + 1) if missing(language_level)
replace G = G_tmp if G_tmp != ""
drop G_tmp

* Keep the numeric level as language_level_tmp_tmp and expose the label
* (1 -> S0, 2 -> S1, 3 -> S2, 4 -> S3) under the name language_level.
rename language_level language_level_tmp_tmp
gen language_level = "S" + string(language_level_tmp_tmp - 1) if inrange(language_level_tmp_tmp, 1, 4)

* G3 applications are European; every other category is non-European.
gen origin = cond(G == "G3", "europe", "noneurope")

* Only G1 applications are flagged as newly arrived.
gen newly_arrived = (G == "G1")

* Country index for the non-G3 rows of each group: the row with the lowest
* draw gets any country 1..num_countries; the row with the highest draw gets
* one of the remaining countries, so the two never coincide.
gen pick_draw = uniform() if G != "G3"
sort group pick_draw
egen pick_lo = min(pick_draw), by(group)
egen pick_hi = max(pick_draw), by(group)

gen country_num = ceil(`num_countries' * uniform()) if pick_draw == pick_lo
egen country_num_first = max(country_num), by(group)

* Draw 1..3 indexes the countries with the first pick removed: shift it up by
* one when it is at or above the first pick.
gen country_num_sec = ceil((`num_countries' - 1) * uniform()) if pick_draw == pick_hi
replace country_num = country_num_sec + (country_num_sec >= country_num_first) ///
    if inrange(country_num_sec, 1, 3) & inrange(country_num_first, 1, 4)

* G3 rows draw from their own (European) list, so no exclusion is applied.
replace country_num = ceil(`num_countries' * uniform()) if G == "G3"

drop pick_draw pick_lo pick_hi country_num_first country_num_sec

* Fill in the applicant's identity from the parameter locals, keyed by
* region (G3 = euro lists, otherwise noneuro lists), country_num and, for
* first name / phone / e-mail, the male_occ flag.
foreach v in country origin_city first_name last_name phone appl_email {
    gen `v' = ""
}

* Comparison operator that selects each region's rows through G.
local op_euro    "=="
local op_noneuro "!="

forvalues k = 1/`num_countries' {
    foreach reg in euro noneuro {

        * Gender-independent fields
        replace country     = "`country_`reg'`k''"   if G `op_`reg'' "G3" & country_num == `k'
        replace origin_city = "`city_`reg'`k''"      if G `op_`reg'' "G3" & country_num == `k'
        replace last_name   = "`last_`reg'_name`k''" if G `op_`reg'' "G3" & country_num == `k'

        * Gender-specific fields
        foreach sex in male female {
            local male_flag = ("`sex'" == "male")
            replace first_name = "`first_`reg'_`sex'_name`k''" if G `op_`reg'' "G3" & country_num == `k' & male_occ == `male_flag'
            replace phone      = "`phone_`reg'_`sex'`k''"      if G `op_`reg'' "G3" & country_num == `k' & male_occ == `male_flag'
            replace appl_email = "`email_`reg'_`sex'`k''"      if G `op_`reg'' "G3" & country_num == `k' & male_occ == `male_flag'
        }
    }
}

* Age depends only on education: 25 without, 28 with higher education.
* It is set for the two known origins and stays missing for anything else.
gen age = .
replace age = 25 if !high_edu & inlist(origin, "europe", "noneurope")
replace age = 28 if  high_edu & inlist(origin, "europe", "noneurope")

* Give the three applications of a group the three addresses of the group's
* city in random order: rank a uniform draw within the group to get slot 1..3.
gen addr_draw = uniform()
bysort group (addr_draw): gen address_no = _n

gen address = ""

* City name as stored in the city variable, paired with the suffix used in
* the address locals.
local city_names "Stockholm Göteborg Malmö"
local city_tags  "sthlm gbg mlm"
forvalues c = 1/3 {
    local town : word `c' of `city_names'
    local tag  : word `c' of `city_tags'
    forvalues a = 1/3 {
        replace address = "`address`a'_`tag''" if address_no == `a' & city == "`town'"
    }
}

drop addr_draw address_no

* Reference date: day 15 of the configured year and month.
gen date_today = date("`year_today'-`month_today'-15", "YMD")
format date_today %tdCCYY-NN-DD

* Birth date: go back `age' years (365.25 days each) from the reference date
* and then a further random 1..364 days.
gen date_birth = date_today - age * 365.25 - ceil(364 * uniform())
format date_birth %tdCCYY-NN-DD

* Education timeline derived from the birth year:
*   primary school  1 Sep (birth_year + 7)  ->  10 Jun, 9 years later
*   high school     starts when primary ends, ends 10 Jun 3 years later
*   university      starts when high school ends, ends 10 Jun 3 years later
gen birth_year = year(date_birth)

local yrs_primary 9
local yrs_high    3
local yrs_univ    3

gen date_start_primschool = mdy(9, 1, birth_year + 7)
gen date_end_primschool   = mdy(6, 10, birth_year + 7 + `yrs_primary')

gen date_start_highschool = date_end_primschool
gen date_end_highschool   = mdy(6, 10, birth_year + 7 + `yrs_primary' + `yrs_high')

gen date_start_univ = date_end_highschool
gen date_end_univ   = mdy(6, 10, birth_year + 7 + `yrs_primary' + `yrs_high' + `yrs_univ')

format date_start_primschool date_end_primschool date_start_highschool ///
    date_end_highschool date_start_univ date_end_univ %tdCCYY-NN-DD

* Most recent job: ends on the reference date.
gen date_endwork1 = date_today
format date_endwork1 %tdCCYY-NN-DD

* It started 0.5 years earlier for newly arrived applicants and 2 years
* earlier for the others (missing for any other flag value) ...
gen date_startwork1 = date_endwork1 - cond(newly_arrived == 1, 0.5, cond(newly_arrived == 0, 2, .)) * 365.25

* ... shifted by a random -59..60 days. The shift stays in the data as days.
gen days = ceil(120 * uniform()) - 60
replace date_startwork1 = date_startwork1 + days
format date_startwork1 %tdCCYY-NN-DD

* Replace every variable whose name starts with "date" by its text form
* YYYY-MM-DD, keeping the variable name.
foreach v of varlist date* {
    gen str_`v' = string(`v', "%tdCCYY-NN-DD")
    drop `v'
    rename str_`v' `v'
}

* Application id: <occupation>_y<year>_m<month>_no<row number>.
sort occ app_row
gen appl_id = occ + "_y`year_today'_m`month_today'_no" + string(app_row)
drop app_row

* Report the two most recent education spells:
*   edu1 = university if high_edu == 1, else high school
*   edu2 = high school if high_edu == 1, else primary school
foreach edge in start end {
    gen date_`edge'_edu1 = cond(high_edu == 1, date_`edge'_univ, date_`edge'_highschool)
    gen date_`edge'_edu2 = cond(high_edu == 1, date_`edge'_highschool, date_`edge'_primschool)
}

drop date_start_primschool date_end_primschool ///
    date_start_highschool date_end_highschool ///
    date_start_univ date_end_univ

* Put the main application fields first; any other variable keeps its place
* after them.
order appl_id group appl_order appl_type occ occ_no high_edu language_level ///
    origin country origin_city first_name last_name gender newly_arrived age ///
    phone appl_email address birth_year date_today date_birth ///
    date_startwork1 date_endwork1 date_start_edu1 date_end_edu1 ///
    date_start_edu2 date_end_edu2

save "Data/data_applications_simulation_balance", replace
